## try http:// if https:// URLs are not supported
source("https://bioconductor.org/biocLite.R")
biocLite("impute")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(impute)
library(broom)
## Warning: package 'broom' was built under R version 3.4.4
library(mice)
## Loading required package: lattice
library(modelr)
##
## Attaching package: 'modelr'
## The following object is masked from 'package:broom':
##
## bootstrap
library(tidyverse)
## -- Attaching packages -------------------------------------------- tidyverse 1.2.1 --
## <U+221A> ggplot2 2.2.1 <U+221A> readr 1.1.1
## <U+221A> tibble 1.4.2 <U+221A> purrr 0.2.4
## <U+221A> tidyr 0.8.0 <U+221A> stringr 1.3.0
## <U+221A> ggplot2 2.2.1 <U+221A> forcats 0.3.0
## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x data.table::between() masks dplyr::between()
## x modelr::bootstrap() masks broom::bootstrap()
## x tidyr::complete() masks mice::complete()
## x dplyr::filter() masks stats::filter()
## x data.table::first() masks dplyr::first()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x purrr::transpose() masks data.table::transpose()
library(naniar)
library(visdat)
Read the data and select the required information
houses <- read_csv("Melbourne_housing_FULL.csv") %>%
select(Price, Rooms, Type, Distance, Bedroom2, Bathroom)
## Parsed with column specification:
## cols(
## .default = col_integer(),
## Suburb = col_character(),
## Address = col_character(),
## Type = col_character(),
## Method = col_character(),
## SellerG = col_character(),
## Date = col_character(),
## Distance = col_double(),
## CouncilArea = col_character(),
## Lattitude = col_double(),
## Longtitude = col_double(),
## Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 12094 BuildingArea no trailing characters .3 'Melbourne_housing_FUL~ file 2 12096 BuildingArea no trailing characters .33 'Melbourne_housing_FUL~ row 3 12139 BuildingArea no trailing characters .23 'Melbourne_housing_FUL~ col 4 12223 BuildingArea no trailing characters .51 'Melbourne_housing_FUL~ expected 5 12252 BuildingArea no trailing characters .3 'Melbourne_housing_FUL~
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
Summary the situation of missing value
vis_dat(houses, palette = "cb_safe")
vis_miss(houses, sort_miss = TRUE)+theme(aspect.ratio=1)
According to the chart we foud out that the missing value account for 11.5% of the total data. Onlt three variables (Bathroom, Bedroom2, Price) have missing values.
miss_summary(houses)
## # A tibble: 1 x 7
## miss_df_prop miss_var_prop miss_case_prop miss_case_table miss_var_table
## <dbl> <dbl> <dbl> <list> <list>
## 1 0.115 0.667 0.403 <tibble [4 x 3~ <tibble [5 x ~
## # ... with 2 more variables: miss_var_summary <list>,
## # miss_case_summary <list>
Although the missings are small in number (11.5%), there are a huge proportion of missing variables (66.7%). Hence we cannot simply drop the missing values. Imputing the missings is necessary.
Dealing with missing values and data mistakes
rooms_calsulation <- houses %>%
mutate(Trooms = Bedroom2 + Bathroom)
rooms_changing <- rooms_calsulation %>%
mutate(Rooms = if_else(Rooms < !is.na(Trooms),Trooms, Rooms))
new_houses1 <- rooms_changing %>%
select(Price, Rooms, Type, Distance, Bedroom2, Bathroom)
We found that the number of total rooms is not equal to the sum of bathrooms and bedrooms in some cases. So we fixed this problem by replacing the existing data with the sum of number of bathrooms and bedrooms.
names(new_houses1)[2] <- "TotalRooms"
names(new_houses1)[5] <- "Bedrooms"
imputed_houses <- new_houses1 %>%
filter(!is.na(Price)) %>%
mutate(
Distance = if_else(
is.na(Distance), mean(Distance, na.rm = TRUE), Distance),
Bedrooms = if_else(
is.na(Bedrooms), as.integer(median(Bedrooms, na.rm = TRUE)), Bedrooms ),
Bathroom = if_else(
is.na(Bathroom), as.integer(median(Bathroom, na.rm = TRUE)), Bathroom ),
TotalRooms = if_else(is.na(TotalRooms),Bathroom+Bedrooms,TotalRooms),
Type = sub("u", "0", Type),
Type = sub("t", "1", Type),
Type = sub("h", "2", Type) )
colSums(is.na(imputed_houses))
## Price TotalRooms Type Distance Bedrooms Bathroom
## 0 0 0 0 0 0
As we will create a regression for predicting Price useing this dataset, so we drop the missings in Price and impute the missing values in Distance, Bedrooms, Bathroom and TotalRooms. Additionally, Type is changed from a catorgorical vairbale to a numberical variable for regression line prediction. In the final code, we use “colSums” function to make sure there are no missing values in the dataset.
Anaylsing the data by plots
imputed_houses$Price <- as.numeric(imputed_houses$Price)
imputed_houses$TotalRooms <- as.numeric(imputed_houses$TotalRooms)
imputed_houses$Type <- as.integer(imputed_houses$Type)
ggplot(data = imputed_houses,aes(x=TotalRooms,y=Price))+
geom_smooth()
## `geom_smooth()` using method = 'gam'
ggplot(data = imputed_houses,aes(x=TotalRooms,y=log(Price)))+
geom_smooth()
## `geom_smooth()` using method = 'gam'
This plot shows that the number of total rooms strongly effect the price. We also tested the correlation between total rooms and logarithmic price in order to create our regression.
ggplot(imputed_houses)+
geom_smooth(aes(x = Distance,y = log(Price)), se = FALSE,color = "orange")+
facet_wrap(~Type)
## `geom_smooth()` using method = 'gam'
ggplot(imputed_houses)+
geom_smooth(aes(x = Distance,y = Price), se = FALSE, color = "orange")+
facet_wrap(~Type)
## `geom_smooth()` using method = 'gam'
This graph indicates that the distance has a negative impact on the price. We also tested the correlation between distance and logarithmic price in order to create our final regression.
ggplot(imputed_houses,aes(x=Bedrooms,y=Price))+
geom_smooth(color="red")
## `geom_smooth()` using method = 'gam'
ggplot(imputed_houses,aes(x=Bedrooms,y=log(Price)))+
geom_smooth(color="red")
## `geom_smooth()` using method = 'gam'
According to this graph, we can that the number of bedrooms can impact the final price of houses. We also tested the correlation between the number of bedrooms and logarithmic price in order to create our final regression.
ggplot(imputed_houses,aes(x=Bathroom,y=Price))+
geom_smooth(color="purple")
## `geom_smooth()` using method = 'gam'
ggplot(imputed_houses,aes(x=Bathroom,y=log(Price)))+
geom_smooth(color="purple")
## `geom_smooth()` using method = 'gam'
According to this graph, we can that the number of bathrooms can effect the final price of houses. Additionally, We detected the correlation between the number of bedrooms and logarithmic price in order to create our final regression.
tbg1 <- filter(imputed_houses, Bathroom <= 2 & TotalRooms <= 8)
ggplot(tbg1, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
tbg2 <- filter(imputed_houses, Bathroom == 3 & TotalRooms <= 8)
ggplot(tbg2, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
tbg3 <- filter(imputed_houses, Bathroom == 4 & TotalRooms <= 8)
ggplot(tbg3, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
tbg4 <- filter(imputed_houses, Bathroom == 5 & TotalRooms <= 8)
ggplot(tbg4, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
tbg5 <- filter(imputed_houses, Bathroom == 6 & TotalRooms <= 8)
ggplot(tbg5, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
tbg6 <- filter(imputed_houses, Bathroom == 7 & TotalRooms <= 8)
ggplot(tbg6, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0065
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 11.007
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
tbg7 <- filter(imputed_houses, Bathroom == 8 & TotalRooms <= 8)
ggplot(tbg7, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 8.8055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
ggplot(imputed_houses)+
geom_hex(aes(x = Distance,y = log(Price)))+
facet_wrap(~Type)
ggplot(imputed_houses)+
geom_hex(aes(x = Distance,y = Price))+
facet_wrap(~Type)
imputed_houses <- mutate(imputed_houses, logprice = log(Price))
tbg1 <- filter(imputed_houses, Bathroom <= 2 & TotalRooms <= 8)
ggplot(tbg1, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
tbg2 <- filter(imputed_houses, Bathroom == 3 & TotalRooms <= 8)
ggplot(tbg2, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
tbg3 <- filter(imputed_houses, Bathroom == 4 & TotalRooms <= 8)
ggplot(tbg3, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
tbg4 <- filter(imputed_houses, Bathroom == 5 & TotalRooms <= 8)
ggplot(tbg4, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
tbg5 <- filter(imputed_houses, Bathroom == 6 & TotalRooms <= 8)
ggplot(tbg5, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
tbg6 <- filter(imputed_houses, Bathroom == 7 & TotalRooms <= 8)
ggplot(tbg6, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0065
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 11.007
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
tbg7 <- filter(imputed_houses, Bathroom == 8 & TotalRooms <= 8)
ggplot(tbg7, mapping = aes(x = Distance, y = logprice, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 8.8055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
ggplot(imputed_houses)+
geom_hex(aes(x = Distance,y = log(Price)))+
facet_wrap(~Type)
ggplot(imputed_houses)+
geom_hex(aes(x = Distance,y = Price))+
facet_wrap(~Type)
imputed_houses <- imputed_houses %>%
mutate(Type = sub("u", "0", Type),
Type = sub("t", "1", Type),
Type = sub("h", "2", Type))
Establish the regression
sim1 <- lm(Price ~ TotalRooms, data=imputed_houses)
sim2 <- lm(Price ~ Distance, data=imputed_houses)
sim3 <- lm(Price ~ Bedrooms, data=imputed_houses)
sim4 <- lm(Price ~ Bathroom, data=imputed_houses)
sim5 <- lm(Price ~ Type, data=imputed_houses)
glance(sim1)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2164467 0.216418 567827.8 7526.088 0 2 -399671.9 799349.9
## BIC deviance df.residual
## 1 799374.5 8.784562e+15 27245
glance(sim2)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.04468259 0.04464753 626983.7 1274.317 8.333567e-273 2 -402372.2
## AIC BIC deviance df.residual
## 1 804750.4 804775 1.071024e+16 27245
glance(sim3)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.1483859 0.1483546 591975.6 4747.189 0 2 -400806.7 801619.4
## BIC deviance df.residual
## 1 801644.1 9.547605e+15 27245
glance(sim4)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.1566628 0.1566319 589091.8 5061.176 0 2 -400673.7 801353.3
## BIC deviance df.residual
## 1 801377.9 9.45481e+15 27245
glance(sim5)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.1364347 0.1363713 596125.8 2152.139 0 3 -400996.6 802001.1
## BIC deviance df.residual
## 1 802034 9.681592e+15 27244
The first model type we tried is single factor model, we will see the relation ship between price and different factors, seems like in single factor model, Distance model(sim2) has the lowest R^2, which mean distance is the worst variables to fit the pridicting price.
sim11 <- lm(Price ~ TotalRooms+Distance, data=imputed_houses)
sim12 <- lm(Price ~ TotalRooms*Distance, data=imputed_houses)
sim13 <- lm(Price ~ TotalRooms+Bedrooms, data=imputed_houses)
sim14 <- lm(Price ~ TotalRooms*Bedrooms, data=imputed_houses)
sim15 <- lm(Price ~ TotalRooms+Bathroom, data=imputed_houses)
sim16 <- lm(Price ~ TotalRooms*Bathroom, data=imputed_houses)
sim17 <- lm(Price ~ TotalRooms+Type, data=imputed_houses)
sim18 <- lm(Price ~ TotalRooms*Type, data=imputed_houses)
glance(sim11)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.3449257 0.3448776 519201.1 7172.587 0 3 -397232.1 794472.2
## BIC deviance df.residual
## 1 794505.1 7.34416e+15 27244
glance(sim12)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.3691967 0.3691272 509501.3 5314.929 0 4 -396717.8 793445.5
## BIC deviance df.residual
## 1 793486.6 7.072054e+15 27243
glance(sim13)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2164565 0.216399 567834.7 3763.123 0 3 -399671.8 799351.6
## BIC deviance df.residual
## 1 799384.4 8.784452e+15 27244
glance(sim14)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2164565 0.2163702 567845.1 2508.657 0 4 -399671.8 799353.6
## BIC deviance df.residual
## 1 799394.6 8.784452e+15 27243
glance(sim15)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2466267 0.2465714 556795.2 4459.342 0 3 -399136.8 798281.7
## BIC deviance df.residual
## 1 798314.5 8.446208e+15 27244
glance(sim16)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2466318 0.2465489 556803.5 2972.867 0 4 -399136.7 798283.5
## BIC deviance df.residual
## 1 798324.6 8.446151e+15 27243
glance(sim17)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2350628 0.2349786 561062.5 2790.563 0 4 -399344.4 798698.7
## BIC deviance df.residual
## 1 798739.8 8.575853e+15 27243
glance(sim18)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2351641 0.2350237 561045.9 1675.158 0 6 -399342.6 798699.1
## BIC deviance df.residual
## 1 798756.6 8.574718e+15 27241
By adding another factor, the adjusted r^2 of model is significantly improved, and for total rooms and distance, the model with interaction has higher adjusted r^2, therefore total rooms will be times by distance in our final model
sim19 <- lm(Price ~ Distance+Bedrooms, data=imputed_houses)
sim20 <- lm(Price ~ Distance*Bedrooms, data=imputed_houses)
sim21 <- lm(Price ~ Distance+Bathroom, data=imputed_houses)
sim22 <- lm(Price ~ Distance*Bathroom, data=imputed_houses)
sim23 <- lm(Price ~ Distance+Type, data=imputed_houses)
sim24 <- lm(Price ~ Distance*Type, data=imputed_houses)
glance(sim19)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2497995 0.2497444 555621.5 4535.813 0 3 -399079.3 798166.7
## BIC deviance df.residual
## 1 798199.5 8.410637e+15 27244
glance(sim20)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2661513 0.2660705 549542.9 3293.485 0 4 -398779.1 797568.2
## BIC deviance df.residual
## 1 797609.3 8.227315e+15 27243
glance(sim21)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2258794 0.2258226 564409.9 3974.742 0 3 -399506.9 799021.9
## BIC deviance df.residual
## 1 799054.7 8.67881e+15 27244
glance(sim22)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2504733 0.2503908 555382.1 3034.646 0 4 -399067.1 798144.2
## BIC deviance df.residual
## 1 798185.3 8.403083e+15 27243
glance(sim23)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2336836 0.2335992 561568 2769.196 0 4 -399368.9 798747.8
## BIC deviance df.residual
## 1 798788.9 8.591316e+15 27243
glance(sim24)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2505556 0.250418 555372 1821.452 0 6 -399065.6 798145.2
## BIC deviance df.residual
## 1 798202.7 8.402161e+15 27241
In theses 6 models, models with interaction shown a higher adjusted r^2 compared with models without interaction. Therefore, in final model, distance will have interaction with other factors
sim25 <- lm(Price ~ Bedrooms+Bathroom, data=imputed_houses)
sim26 <- lm(Price ~ Bedrooms*Bathroom, data=imputed_houses)
sim27 <- lm(Price ~ Bedrooms+Type, data=imputed_houses)
sim28 <- lm(Price ~ Bedrooms*Type, data=imputed_houses)
sim29 <- lm(Price ~ Bathroom+Type, data=imputed_houses)
sim30 <- lm(Price ~ Bathroom*Type, data=imputed_houses)
glance(sim25)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.1945787 0.1945195 575707.5 3290.887 0 3 -400047 800101.9
## BIC deviance df.residual
## 1 800134.8 9.029729e+15 27244
glance(sim26)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.1945807 0.194492 575717.4 2193.872 0 4 -400046.9 800103.8
## BIC deviance df.residual
## 1 800144.9 9.029706e+15 27243
glance(sim27)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.200891 0.200803 573457.6 2282.907 0 4 -399939.8 799889.5
## BIC deviance df.residual
## 1 799930.6 8.95896e+15 27243
glance(sim28)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2029494 0.2028031 572739.6 1387.251 0 6 -399904.6 799823.2
## BIC deviance df.residual
## 1 799880.7 8.935883e+15 27241
glance(sim29)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2314362 0.2313516 562390.9 2734.545 0 4 -399408.8 798827.6
## BIC deviance df.residual
## 1 798868.7 8.616512e+15 27243
glance(sim30)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.2335036 0.2333629 561654.6 1659.727 0 6 -399372.1 798758.2
## BIC deviance df.residual
## 1 798815.7 8.593334e+15 27241
In these 6 models, there is not significant difference between model with interaction and model without interaction, therefore there will not be interaction between these factors in the final model
sim31 <- lm(Price ~ Distance*TotalRooms+Distance*Bedrooms+Distance*Bathroom+Distance*Type, data=imputed_houses)
sim32 <- lm(Price ~ Distance*TotalRooms*Bedrooms*Bathroom*Type, data=imputed_houses)
sim33 <- lm(Price ~ Distance*TotalRooms*(Bedrooms+Bathroom)*Type, data=imputed_houses)
sim34 <- lm(log(Price) ~ Distance*TotalRooms*Bedrooms*Bathroom*Type, data=imputed_houses)
glance(sim31)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4347131 0.4344848 482388.1 1904.007 0 12 -395223.8 790473.6
## BIC deviance df.residual
## 1 790580.4 6.337537e+15 27235
glance(sim32)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4452829 0.4443243 478173.1 464.5361 0 48 -394966.7 790031.3
## BIC deviance df.residual
## 1 790433.7 6.219037e+15 27199
glance(sim33)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.4424147 0.4416975 479301.9 616.8716 0 36 -395036.9 790147.8
## BIC deviance df.residual
## 1 790451.7 6.251192e+15 27211
glance(sim34)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.5425566 0.5417662 0.3499522 686.3772 0 48 -10029.57
## AIC BIC deviance df.residual
## 1 20157.14 20559.56 3330.967 27199
Based on the results above, the sim34 has the largest adjusted R-squared which is 0.539. Sim34 is the best regression of price.
Summary the final model
summary(sim34)
##
## Call:
## lm(formula = log(Price) ~ Distance * TotalRooms * Bedrooms *
## Bathroom * Type, data = imputed_houses)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.22443 -0.24485 -0.01407 0.22292 2.47430
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 13.805721 0.415854 33.198
## Distance -0.215951 0.043955 -4.913
## TotalRooms -0.125527 0.204015 -0.615
## Bedrooms -0.896634 0.182081 -4.924
## Bathroom -0.880427 0.392874 -2.241
## Type1 0.124199 0.791999 0.157
## Type2 -0.235680 0.426790 -0.552
## Distance:TotalRooms 0.076549 0.019148 3.998
## Distance:Bedrooms 0.108199 0.020943 5.166
## TotalRooms:Bedrooms 0.322259 0.063045 5.112
## Distance:Bathroom 0.133684 0.039717 3.366
## TotalRooms:Bathroom 0.216376 0.184870 1.170
## Bedrooms:Bathroom 0.766044 0.173623 4.412
## Distance:Type1 0.065623 0.079324 0.827
## Distance:Type2 0.171420 0.044664 3.838
## TotalRooms:Type1 0.034648 0.380441 0.091
## TotalRooms:Type2 0.279545 0.206813 1.352
## Bedrooms:Type1 0.503800 0.331563 1.519
## Bedrooms:Type2 0.851587 0.186105 4.576
## Bathroom:Type1 0.374593 0.623293 0.601
## Bathroom:Type2 0.905942 0.396624 2.284
## Distance:TotalRooms:Bedrooms -0.039407 0.006229 -6.327
## Distance:TotalRooms:Bathroom -0.034351 0.015929 -2.157
## Distance:Bedrooms:Bathroom -0.077498 0.019673 -3.939
## TotalRooms:Bedrooms:Bathroom -0.208657 0.053812 -3.878
## Distance:TotalRooms:Type1 -0.019786 0.035600 -0.556
## Distance:TotalRooms:Type2 -0.069262 0.019319 -3.585
## Distance:Bedrooms:Type1 -0.066311 0.033326 -1.990
## Distance:Bedrooms:Type2 -0.103033 0.021179 -4.865
## TotalRooms:Bedrooms:Type1 -0.195566 0.109480 -1.786
## TotalRooms:Bedrooms:Type2 -0.314213 0.063555 -4.944
## Distance:Bathroom:Type1 -0.073268 0.062229 -1.177
## Distance:Bathroom:Type2 -0.131473 0.039886 -3.296
## TotalRooms:Bathroom:Type1 0.016310 0.291169 0.056
## TotalRooms:Bathroom:Type2 -0.148783 0.186069 -0.800
## Bedrooms:Bathroom:Type1 -0.472624 0.284374 -1.662
## Bedrooms:Bathroom:Type2 -0.749141 0.174711 -4.288
## Distance:TotalRooms:Bedrooms:Bathroom 0.021604 0.004886 4.422
## Distance:TotalRooms:Bedrooms:Type1 0.022594 0.010471 2.158
## Distance:TotalRooms:Bedrooms:Type2 0.036933 0.006262 5.898
## Distance:TotalRooms:Bathroom:Type1 0.006793 0.026421 0.257
## Distance:TotalRooms:Bathroom:Type2 0.030979 0.016001 1.936
## Distance:Bedrooms:Bathroom:Type1 0.054173 0.028550 1.897
## Distance:Bedrooms:Bathroom:Type2 0.076970 0.019724 3.902
## TotalRooms:Bedrooms:Bathroom:Type1 0.109128 0.076302 1.430
## TotalRooms:Bedrooms:Bathroom:Type2 0.198599 0.053860 3.687
## Distance:TotalRooms:Bedrooms:Bathroom:Type1 -0.012144 0.007054 -1.722
## Distance:TotalRooms:Bedrooms:Bathroom:Type2 -0.020939 0.004888 -4.284
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## Distance 9.02e-07 ***
## TotalRooms 0.538373
## Bedrooms 8.51e-07 ***
## Bathroom 0.025035 *
## Type1 0.875390
## Type2 0.580804
## Distance:TotalRooms 6.41e-05 ***
## Distance:Bedrooms 2.40e-07 ***
## TotalRooms:Bedrooms 3.22e-07 ***
## Distance:Bathroom 0.000764 ***
## TotalRooms:Bathroom 0.241840
## Bedrooms:Bathroom 1.03e-05 ***
## Distance:Type1 0.408086
## Distance:Type2 0.000124 ***
## TotalRooms:Type1 0.927435
## TotalRooms:Type2 0.176490
## Bedrooms:Type1 0.128656
## Bedrooms:Type2 4.76e-06 ***
## Bathroom:Type1 0.547852
## Bathroom:Type2 0.022371 *
## Distance:TotalRooms:Bedrooms 2.54e-10 ***
## Distance:TotalRooms:Bathroom 0.031052 *
## Distance:Bedrooms:Bathroom 8.19e-05 ***
## TotalRooms:Bedrooms:Bathroom 0.000106 ***
## Distance:TotalRooms:Type1 0.578360
## Distance:TotalRooms:Type2 0.000337 ***
## Distance:Bedrooms:Type1 0.046628 *
## Distance:Bedrooms:Type2 1.15e-06 ***
## TotalRooms:Bedrooms:Type1 0.074060 .
## TotalRooms:Bedrooms:Type2 7.70e-07 ***
## Distance:Bathroom:Type1 0.239049
## Distance:Bathroom:Type2 0.000981 ***
## TotalRooms:Bathroom:Type1 0.955330
## TotalRooms:Bathroom:Type2 0.423941
## Bedrooms:Bathroom:Type1 0.096528 .
## Bedrooms:Bathroom:Type2 1.81e-05 ***
## Distance:TotalRooms:Bedrooms:Bathroom 9.83e-06 ***
## Distance:TotalRooms:Bedrooms:Type1 0.030948 *
## Distance:TotalRooms:Bedrooms:Type2 3.72e-09 ***
## Distance:TotalRooms:Bathroom:Type1 0.797101
## Distance:TotalRooms:Bathroom:Type2 0.052862 .
## Distance:Bedrooms:Bathroom:Type1 0.057775 .
## Distance:Bedrooms:Bathroom:Type2 9.55e-05 ***
## TotalRooms:Bedrooms:Bathroom:Type1 0.152664
## TotalRooms:Bedrooms:Bathroom:Type2 0.000227 ***
## Distance:TotalRooms:Bedrooms:Bathroom:Type1 0.085159 .
## Distance:TotalRooms:Bedrooms:Bathroom:Type2 1.85e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.35 on 27199 degrees of freedom
## Multiple R-squared: 0.5426, Adjusted R-squared: 0.5418
## F-statistic: 686.4 on 47 and 27199 DF, p-value: < 2.2e-16
In “step 4” we have proved that all 5 variable have significant effect on price, so we do not need to drop any variables in our final model. Furthermore, in “step 5” we tried lots of formula to find out the best equation of the regression. Finally, the sim34 comes out to be the best model which has the largest adjusted R-squared.
Predicting the Price
imputed_houses <- add_predictions(imputed_houses, sim34)
tbg21 <- filter(imputed_houses, Bathroom <= 2 & TotalRooms <= 8)
ggplot(tbg21, mapping = aes(x = Distance, color = TotalRooms))+
geom_line(aes(y = pred), colour = "red")+
geom_point(aes(y = logprice))+
facet_grid(Bathroom ~ Type)
imputed_houses <- add_residuals(imputed_houses, sim34)
ggplot(imputed_houses, aes(resid))+
geom_freqpoly(binwidth = 0.5)
ggplot(imputed_houses, aes(logprice, resid))+
geom_ref_line(h = 0)+
geom_point()
These plots are from data that filled in the pridicted price.
rooms_calsulation <- houses %>%
mutate(special = Bedroom2 + Bathroom)
rooms_changing <- rooms_calsulation %>%
mutate(TotalRooms = if_else(Rooms < !is.na(special),special, Rooms))
houses3 <- rooms_changing %>%
select(Price, TotalRooms, Type, Distance, Bedroom2, Bathroom)
houses4 <- houses3 %>%
mutate(
Distance = if_else(
is.na(Distance), mean(Distance, na.rm = TRUE), Distance),
Bedrooms = if_else(
is.na(Bedroom2), as.integer(median(Bedroom2, na.rm = TRUE)), Bedroom2 ),
Bathroom = if_else(
is.na(Bathroom), as.integer(median(Bathroom, na.rm = TRUE)), Bathroom
))
md.pattern(houses4)
## Warning in data.matrix(x): NAs introduced by coercion
## TotalRooms Distance Bathroom Bedrooms Price Bedroom2 Type
## 20806 1 1 1 1 1 1 0 1
## 5834 1 1 1 1 0 1 0 2
## 6441 1 1 1 1 1 0 0 2
## 1776 1 1 1 1 0 0 0 3
## 0 0 0 0 7610 8217 34857 50684
We run the code again to make sure there is no missings in the prdicted results.
houses4 <- houses4 %>%
mutate(Type = sub("0", "unit_houses", Type),
Type = sub("1", "town_houses", Type),
Type = sub("2", "houses", Type))
tbg21 <- filter(houses4, Bathroom <= 2 & TotalRooms <= 8)
ggplot(tbg21, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 6830 rows containing non-finite values (stat_smooth).
## Warning: Computation failed in `stat_smooth()`:
## x has insufficient unique values to support 10 knots: reduce k.
## Warning: Removed 6830 rows containing missing values (geom_point).
tbg22 <- filter(houses4, Bathroom == 3 & TotalRooms <= 8)
ggplot(tbg22, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'gam'
## Warning: Removed 643 rows containing non-finite values (stat_smooth).
## Warning: Removed 643 rows containing missing values (geom_point).
tbg23 <- filter(houses4, Bathroom == 4 & TotalRooms <= 8)
ggplot(tbg23, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 92 rows containing non-finite values (stat_smooth).
## Warning: Removed 92 rows containing missing values (geom_point).
tbg24 <- filter(houses4, Bathroom == 5 & TotalRooms <= 8)
ggplot(tbg24, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 31 rows containing non-finite values (stat_smooth).
## Warning: Removed 31 rows containing missing values (geom_point).
tbg25 <- filter(houses4, Bathroom == 6 & TotalRooms <= 8)
ggplot(tbg25, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).
tbg26 <- filter(houses4, Bathroom == 7 & TotalRooms <= 8)
ggplot(tbg26, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 9.6935
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0065
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 11.007
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 4.225e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 2 rows containing missing values (geom_point).
tbg27 <- filter(houses4, Bathroom == 8 & TotalRooms <= 8)
ggplot(tbg27, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 8.8055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
tbg28 <- filter(houses4, Bathroom >= 8 & TotalRooms <= 8)
ggplot(tbg28, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 8.8055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 3.025e-05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 1 rows containing missing values (geom_point).
tbg29 <- filter(houses4, TotalRooms >= 8)
ggplot(tbg29, mapping = aes(x = Distance, y = Price, color = TotalRooms))+
geom_point()+
geom_smooth(se = FALSE)+
facet_grid(Bathroom ~ Type)
## `geom_smooth()` using method = 'loess'
## Warning: Removed 10 rows containing non-finite values (stat_smooth).
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 2.574
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 0.000676
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 2.574
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.026
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.826
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 0.000676
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.000676
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 10.365
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.735
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 45.36
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 4
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : Chernobyl! trL>n 4
## Warning in sqrt(sum.squares/one.delta): NaNs produced
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 10.47
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 3.33
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 7.4529
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 7.6525
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 0.0022563
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 7.6525
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.0475
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at 17.247
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 0.0022562
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 0.0022562
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning: Removed 10 rows containing missing values (geom_point).
ggplot(houses4)+
geom_hex(aes(x = Distance,y = log(Price)))+
facet_wrap(~Type)
## Warning: Removed 7610 rows containing non-finite values (stat_binhex).
ggplot(houses4)+
geom_hex(aes(x = Distance,y = Price))+
facet_wrap(~Type)
## Warning: Removed 7610 rows containing non-finite values (stat_binhex).
We make plots for the pridicted data.